{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from Ad_Utils import raw_data_path,feature_data_path,result_path,cache_pkl_path,dump_pickle,load_pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gen_global_index():\n",
    "    \"\"\"Give train and test rows one shared running index and pickle both.\n",
    "\n",
    "    Reads train.csv and test.csv from raw_data_path, stacks train on top of\n",
    "    test, numbers the stacked rows 0..N-1 in a new global_index column,\n",
    "    splits the frame back at the original train length and writes the two\n",
    "    halves to train.pkl / test.pkl under raw_data_path via dump_pickle.\n",
    "    Because the split happens after stacking, both halves carry the union\n",
    "    of the train and test columns.\n",
    "    \"\"\"\n",
    "    train = pd.read_csv(raw_data_path+'train.csv')\n",
    "    test = pd.read_csv(raw_data_path+'test.csv')\n",
    "    n_train = train.shape[0]\n",
    "    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;\n",
    "    # the original row labels are kept, exactly as append kept them.\n",
    "    all_data = pd.concat([train,test])\n",
    "    all_data['global_index'] = np.arange(0,all_data.shape[0])\n",
    "    # Split positionally: the first n_train rows are the train rows.\n",
    "    train = all_data.iloc[:n_train,:]\n",
    "    test = all_data.iloc[n_train:,:]\n",
    "    dump_pickle(train,raw_data_path+'train.pkl')\n",
    "    dump_pickle(test,raw_data_path+'test.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def csv_pkl(csv_name_without_suffix,protocol=None):\n",
    "    \"\"\"Pickle a raw CSV table once, skipping the work if the pickle exists.\n",
    "\n",
    "    csv_name_without_suffix: file name under raw_data_path, without '.csv'.\n",
    "    protocol: forwarded unchanged to dump_pickle.\n",
    "\n",
    "    Prints whether the pickle was found or is being generated.\n",
    "    \"\"\"\n",
    "    stem = raw_data_path+csv_name_without_suffix\n",
    "    target = stem+'.pkl'\n",
    "    if os.path.exists(target):\n",
    "        print('found '+target)\n",
    "        return\n",
    "    print('generating '+target)\n",
    "    frame = pd.read_csv(stem+'.csv')\n",
    "    dump_pickle(frame,target,protocol=protocol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gen_demo_result():\n",
    "    \"\"\"Write a demo result file built from the raw test set.\n",
    "\n",
    "    Takes the instanceID and label columns of test.csv, renames label to\n",
    "    prob and saves them, without the row index, as demo_result.csv under\n",
    "    result_path. The result directory is created when it is missing.\n",
    "    \"\"\"\n",
    "    test = pd.read_csv(raw_data_path+'test.csv')\n",
    "    # Chained, non-inplace rename: avoids mutating a column-subset frame,\n",
    "    # which can raise SettingWithCopyWarning.\n",
    "    demo = test[['instanceID','label']].rename(columns={'label':'prob'})\n",
    "    # makedirs with exist_ok also creates missing parents and has no\n",
    "    # check-then-create race, unlike exists() followed by mkdir().\n",
    "    os.makedirs(result_path,exist_ok=True)\n",
    "    demo.to_csv(result_path+'demo_result.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found ../raw_data/ad.pkl\n",
      "found ../raw_data/position.pkl\n",
      "found ../raw_data/app_categories.pkl\n",
      "found ../raw_data/test.pkl\n",
      "found ../raw_data/user_app_actions.pkl\n",
      "found ../raw_data/user.pkl\n",
      "generating ../raw_data/user_installedapps.pkl\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    # Build train.pkl / test.pkl with the shared global_index column.\n",
    "    gen_global_index()\n",
    "    train = load_pickle(raw_data_path+'train.pkl')\n",
    "    # Drop the data from the 16th (presumably clickTime encodes the day in\n",
    "    # its leading digits, so 17000000 is the start of day 17 - confirm).\n",
    "    train = train[train.clickTime>=17000000]\n",
    "    dump_pickle(train,raw_data_path+'train.pkl')\n",
    "\n",
    "    # Cache the remaining raw tables as pickles; existing pickles are kept,\n",
    "    # so the test.pkl written by gen_global_index() is not overwritten.\n",
    "    for name in ('ad','position','app_categories','test','user_app_actions','user'):\n",
    "        csv_pkl(name)\n",
    "    # NOTE(review): protocol=4 is presumably needed because this table is\n",
    "    # very large - confirm against dump_pickle.\n",
    "    csv_pkl('user_installedapps',protocol=4)\n",
    "\n",
    "    gen_demo_result()\n",
    "\n",
    "    # Create the working directories; exist_ok makes re-runs safe and\n",
    "    # missing parent directories are created as well.\n",
    "    os.makedirs(feature_data_path,exist_ok=True)\n",
    "    os.makedirs(cache_pkl_path,exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
